#!/usr/bin/perl

# Written by Elihu Ihms (ihms@steelsnowflake.com)

# Changelog:
# 01.25.2010	written
# 01.27.2010	fixed minimization function, added help
# 03.10.2010	fixed to deal with negative numbers in log mode
# 04.06.2011	added verbose option
# 11.11.2011	added -dx scaling opting
$version='04.06.2011';

# This script multiplies a file containing a single column of data by a series of constants, then calculates the SSE against a second row. It then reports the factor that generates the smallest total SSE
# Alternatively, the -dx option will use the scaling formula from Bernado et al. J. Am. Chem. Soc. (2007) to find the best scaling factor to match the original data in it's provided error band

# default argument values
$argcounter			= 0;
$file1				= 'false';
$file2				= 'false';
$dx					= 'false';
$upper				= 100;
$lower				= 0;
$limit				= 0.0001;
$grid_size			= 10;
$max_iterations		= 1000;
$save_file			= 'false';
$use_log			= 'false';
$verbose			= 'false';

#go through the arguments and get their values
while ( $argcounter <= $#ARGV )
{
	if ( $ARGV[ $argcounter ] eq '-dx' )
	{
		$dx = 'true';
	}
	elsif ( $ARGV[ $argcounter ] eq "-upper" )
	{
		$upper = $ARGV[ $argcounter +1 ];
		$argcounter++;
	}
	elsif ( $ARGV[ $argcounter ] eq "-lower" )
	{
		$lower = $ARGV[ $argcounter +1 ];
		$argcounter++;
	}
	elsif ( $ARGV[ $argcounter ] eq "-limit" )
	{
		$limit = $ARGV[ $argcounter +1 ];
		$argcounter++;
	}
	elsif ( $ARGV[ $argcounter ] eq "-grid" )
	{
		$grid_size = $ARGV[ $argcounter +1 ];
		$argcounter++;
	}
	elsif ( $ARGV[ $argcounter ] eq "-save" )
	{
		$save_file = $ARGV[ $argcounter +1 ];
		$argcounter++;
	}
	elsif ( $ARGV[ $argcounter ] eq "-iter" )
	{
		$max_iterations = $ARGV[ $argcounter +1 ];
		$argcounter++;
	}
	elsif ( $ARGV[ $argcounter ] eq "-log" ){
		$use_log = "true";
	}
	elsif ( $ARGV[ $argcounter ] eq "-v" ){
		$verbose = "true";
	}
	elsif ($ARGV[ $argcounter ] eq "-version"){
		print $version."\n";
		exit;
	}
	elsif ($ARGV[ $argcounter ] eq "-h"){
		show_help();
	}
	elsif ($ARGV[ $argcounter ] eq "-help"){
		show_help();
	}
	
	$argcounter++;
}

# files to compare are always last two arguments
$file1 = $ARGV[$#ARGV -1];
$file2 = $ARGV[$#ARGV];

# open and read the file
# m_values = match values
open(SRCFILE, $file1) || die "Can't open file: '$file1:' $!\n";
@m_values = <SRCFILE>;
close(SRCFILE);

@cols = split(/\s+/,$m_values[1]);
if (($cols[2] eq '') && ($dx eq 'true')){
	print "The -dx option requires that the first datafile contains an uncertainty column!\n";
	exit;
}

# s_values = scaled_values (values to scale to match)
open(SRCFILE, $file2) || die "Can't open file: '$file2': $!\n";
@s_values = <SRCFILE>;
close(SRCFILE);

chomp(@m_values);
chomp(@s_values);

# check and make sure that m_values and s_values are the same size
if ($#m_values != $#s_values){
	die "Files to match must have an identical number of rows!\n There are ".$#m_values." rows in ".$file1." and ".$#s_values." rows in ".$file2.".\n";
}

if ($dx eq 'true')
{
	# From: Pelikan et. al., Gen. Physiol. Biophys. (2009)
	# doi: 10.4149/gbp_2009_02_174
	# Original: Bernado et al. J. Am. Chem. Soc. (2007)
	
	$a = 0;
	$b = 0;
	for($i=0; $i< $#m_values; $i++)
	{
		@exp = split("\t",$m_values[ $i ]);
		
		$a += ( $s_values[ $i ] * $exp[1] ) / ( $exp[2]**2 );
		$b += ( $s_values[ $i ] * $s_values[ $i ] ) / ( $exp[2]**2 );
	}

	$scale = $a / $b;
	print("Scaling factor for $file2 to match $file1 within error band: $scale\n");
	exit;
}
else
{
	# initialize starting guesses
	$guess_min = $lower;
	$guess_max = $upper;
	
	# don't exceed max iterations
	for($i=0; $i < $max_iterations; $i++)
	{	
		# subdivide range into grid search
		$step = ($guess_max - $guess_min) / $grid_size;
	
		# starting point chi_sq value
		$new_guess_1 = $guess_min;
		$new_guess_2 = $guess_max;
		$new_chisq_1 = calc_chisq($new_guess_1);
		$new_chisq_2 = calc_chisq($new_guess_2);
	
		if (($verbose eq 'true') && (($i % 100) == 0)){
			print "$i: $guess_max > x < $guess_min\n";
		}
	
		# calculate chisq values for each grid point
		for ($j=$guess_min; $j<$guess_max; $j+=$step)
		{
			# calculate the chi value at the new guess
			$guess_chisq = calc_chisq($j);
	#		print "\t$guess_min > $j < $guess_max: $guess_chisq";		
			
			# if our latest guess was better, replace our previous boundary guesses
			if ($guess_chisq < $new_chisq_1)
			{
	#			print " <1!";
				$new_guess_1 = $j;
				$new_chisq_1 = $guess_chisq;
			}
			elsif($guess_chisq < $new_chisq_2)
			{
	#			print " <2!";
				$new_guess_2 = $j;
				$new_chisq_2 = $guess_chisq;
			}
	#		print "\n";
		}
		
	#	print "$new_guess_1 > x < $new_guess_2, $step\n";
		
		# now that we've found the closest guesses from our previous gridsearch, update
		$guess_min = min($new_guess_1,$new_guess_2) -$step;
		$guess_max = max($new_guess_1,$new_guess_2) +$step;
			
		# are we close enough?
		if (abs($guess_max - $guess_min) < $limit)
		{
			$val = (($guess_max+$guess_min)/2);
			$sig_figs = sprintf('%.0f',-1*log($limit)/log(10));
			$val = sprintf('%.'.$sig_figs.'g',$val);
			print("Converged to within $limit, scaling factor for $file2 to match $file1: $val\n");
			if ($save_file ne "false"){
				save_file(($guess_max+$guess_min)/2);
			}
			exit;
		}
	}
	
	print "Did not converge to within $limit, last fit value=".(($guess_max+$guess_min)/2)."\n";
	if ($save_file ne "false"){
		save_file(($guess_max+$guess_min)/2);
	}
	exit;
}

# calculate chi squared for two sets of values
sub calc_chisq
{
	$sum = 0;
	
	# sum of differences between actual and guess values squared
	for($z=0; $z <= $#m_values; $z++)
	{	
		if ($use_log eq "true"){
			$dev = (safe_log($m_values[$z]) - (safe_log($s_values[$z] * $_[0])))**2;
		}else{
			$dev = ($m_values[$z] - ($s_values[$z] * $_[0]))**2;
		}
	
		$sum += $dev;
	}

	return $sum;
}

# save scaled values to file
sub save_file
{
	open(OUTPUT, ">", $save_file) or die("Can't open '$output_file' for writing: $!");

	for($z=0; $z <= $#m_values; $z++){
		print OUTPUT $m_values[$z]."\t".($s_values[$z]*$_[0])."\n";
	}
	
	close(OUTPUT);
}

sub min
{
	$min = $_[0];
	foreach $test (@_){
		if ($test < $min){
			$min = $test;
		}
	}
	return $min;
}

sub max
{
	$max = $_[0];
	foreach $test (@_){
		if ($test > $max){
			$max = $test;
		}
	}
	return $max;
}

sub show_help
{
	print "tscale (-dx)|(-upper=N -lower=N -limit=N -iter=N -grid=N -save [file] -log) [exp_file] [fit_file]\n";
	exit;
}

sub safe_log
{
	my ($z) = @_;

	if ($z == 0){
		return 0;
	}elsif ($z < 0){
		return log(-1*$z);
	}else{
		return log($z);
	}
}
